/*
NOTES TO DATA CLEANING:
	1. Create a log of issues with treatment assignments or IDs by date
	2. Import treatment assignment files (pre-adaptive) with behavior and strata
	3. Import treatment assignments files (adaptive) with scrambled id and treatment arm 
		a. Merge with above
	4. Load raw survey data 
	5. Merge with behavior and treatment files
	6. Cleaning
		a. Check for duplicate IDs
		b. Check for response rates - differential attrition?
		c. Create a subset of data with completed interviews for analysis. 
		d. Rename and label the variables and values
		e. Check for outliers, inconsistencies, recodes
		f. Create dummy variables, where required
		g. Maintain a log of issues faced
	7. Merge with delivery reports for SMS to check compliance
	8. Define treatments, strata, outcomes
	9. Derived variables (possibly transfer to analysis code)
*/

* Start from a clean session: empty memory, no -more- pauses, and no log left open
clear all
set more off
capture log close


********************************************************************************
***1. Create a log of issues with treatment assignments or IDs by date 
********************************************************************************
/*

Date: 08/24/20 - 1. Additional numbers added as controls along with treatments to call on Aug 27 and 28
Date: 09/03/20 - 1. Scrambleid 6259 was interviewed twice (2nd sep and 23rd aug)
Date: 09/04/20 - 1. More reassignment of treatment arms to Arms 6-10 due to some typing error in the code
				 2. 12 observations with missing treatment assignment (scrambledid = 17467 13193  16578  2410  14055  8335  4445   10861  2695   4347   6467  10306)
Date: 09/08/20 - 1. 7 observations with missing treatment assignment (scrambledid = 9413   2797  17238  10956  14109  7819  6848)
Date: 09/15/20 - 1. 4 observations with incorrectly matched treatment assignment (or missing? - need to check) (scrambledid = 4085 6057 7197 13026)

*/

********************************************************************************
***2. Import treatment assignments files (pre-adaptive) with behavior and strata
********************************************************************************

* Recover the scrambled IDs that were surveyed but only exist in the older
* (20 Aug) master list, i.e. they did not make it into the final master list
use "$treatment/bihar_actualsample_trial_deidentified_complete_20Aug20", clear

keep if inlist(scrambledid, 8247, 10821, 11663, 16676, 18323, 2955, 3679, 5019, ///
	5777, 7262, 8439, 9482, 10054, 10129, 10423, 10674, 11878, 11942, 12251, ///
	13819, 14254, 14597, 14685, 15025, 15587, 16542, 17499, 17622, 17934, 18172)

tempfile old_master
save `old_master', replace

* NOTE: scrambled id 14597 has an observation in the survey data but could not be
* found anywhere in the master file archive

use "$treatment/bihar_actualsample_trial_deidentified_complete_25Sept20", clear

* Bring in the completed block / child_uid values first, i.e. before the
* left-behind IDs from the old master list are appended
merge 1:1 scrambledid using "$treatment/missingblocks15Dec20", ///
	keepusing(block_complete child_uid_complete)

* Where block is already filled it must agree with the completed value;
* the remaining gaps are filled from block_complete
assert block == block_complete if !missing(block)
replace block = block_complete if missing(block)
assert !missing(block)

*assert child_uid==child_uid_complete if !mi(child_uid)
*replace child_uid = child_uid_complete if mi(child_uid)
*assert !mi(child_uid)

drop block_complete child_uid_complete

* Add the left-behind IDs saved from the old master list
append using `old_master'

* block must still be complete after the append
assert !missing(block)
***** assert !mi(child_uid) REMOVED BECAUSE NOT KEEPING CHILD UID IN CASE IDENTIFIABLE

gisid scrambledid
order scrambledid backupid pairnumber, first

* Tabulate every variable from dateofdelivery through idate, including missings
foreach var of varlist dateofdelivery-idate {
	tabulate `var', missing
}

* Rows without a delivery date are expected to be controls (behavior code 9)
tabulate behavior if missing(dateofdelivery)
assert behavior == 9 if missing(dateofdelivery)
tabulate arm dateofdelivery, missing
tabulate arm if missing(dateofdelivery)

* Variable renames
rename statename state
rename highstsc highscst
rename firstbatch samplebatch

* Variable labels
label variable state "State"
label variable block "Block"
label variable highlit "High literacy block"
label variable highscst "High SC/ST block"
label variable scrambledid "Unique ID"
label variable samplebatch "Sample batch"
label variable behavior "Treatment target behavior and control"
label variable behavior_interview "Behavior - treatment and control"
note behavior_interview: "The behavior for control group is used for the list experiment"
label variable idate "Date of submission (sample)"

* Value labels (the value label keeps its original name, highstsc, after the rename)
label list highstsc
label define highstsc 0"Low SCST" 1"High SCST", modify
label list stratum
label define stratum 1 "Low SCST Low literacy" 2 "Low SCST High literacy" 3 "High SCST Low literacy" 4 "High SCST High literacy", modify

count //13382

* Retain only the variables needed downstream, in this order
keeporder scrambledid backupid pairnumber samplebatch dateofdelivery dateofinterview ///
		day behavior behavior_interview arm ///
		state block centertype stratum highlit highscst idate

tempfile sample
save `sample', replace

********************************************************************************
***3. Import treatment assignments files (adaptive) with scrambled id and treatment arm
********************************************************************************

* Relative file names used below are resolved inside the treatment folder
cd "$treatment"

* STEP 1) INITIAL TREATMENTS FROM THE MASTER LIST
* Only the ID and the arm are read; the arm's value label text becomes the
* string treatment, and rows without an arm are discarded
use scrambledid arm using "$treatment/bihar_actualsample_trial_deidentified_complete_25Sept20", clear

decode arm, gen(treatment)
drop if missing(treatment)
drop arm
gen treatmentfilename = "initial_assignment_0816"

tempfile initial_assignments
save `initial_assignments', replace

* STEP 2) EXTRACT DAILY SMS ASSIGNMENTS
* Every sms_assignments_*.dta file in the working directory is loaded, its
* treatment variable is converted to its label text, the source file name is
* recorded, and the result is held in a tempfile; all tempfiles are then stacked.
local filelist: dir . files "sms_assignments_*.dta"
local j = 0
foreach file in `filelist' {
	local ++j
	tempfile file`j'
	use "`file'", clear
	* Swap in the decoded string ONLY when the decode succeeded. Previously the
	* original variable was dropped even after a failed decode (e.g. treatment
	* already stored as a string), which silently discarded that file's assignments.
	cap decode treatment, gen(temp)
	if !_rc {
		drop treatment
		rename temp treatment
	}
	gen treatmentfilename = "`file'"
	save `file`j''
}
clear

* Stack the per-file tempfiles (no iterations when no file was found)
forval i = 1/`j' {
	append using `file`i''
}

* STEP 3) ADD INITIAL TREATMENTS
append using `initial_assignments'

label variable treatmentfilename "File name for treatment assignment"

* Remove the superseded Sep 19 file (either capitalisation of its name)
drop if inlist(treatmentfilename, "sms_assignments_091920_OLD.dta", "sms_assignments_091920_old.dta")

* STEP 4) FIX TYPE MISMATCH ISSUE WITH ASSIGNMENTS FILES ON 8/26 and 8/27
* The rows that came from these two files are removed here ...
drop if inlist(treatmentfilename, "sms_assignments_082620.dta", "sms_assignments_082720.dta")

* ... every remaining row must carry a file name, because the re-appended rows
* below are recognised by their empty file name ...
assert !missing(treatmentfilename)

* ... and the two files are appended again as stored on disk, then tagged
foreach rawfile in "sms_assignments_082620.dta" "sms_assignments_082720.dta" {
	append using "`rawfile'"
	replace treatmentfilename = "`rawfile'" if treatmentfilename == ""
}
//ALL ASSIGNMENTS NOW IMPORTED

*Turn the treatment text into numeric codes 1-10 and build the matching value label
*(text that matches none of the ten arm names stays missing)
	rename treatment treat_str
	gen treatment = .
	label define treatment 0 "Control"

	* Codes 1-5 are the "twice morning" arms, 6-10 the "morning/evening" arms;
	* within each timing the framing order is fixed as listed below
	local code = 0
	foreach timing in "twice morning" "morning/evening" {
		foreach framing in "Neutral" "Public gain" "Public loss" "Private gain" "Private loss" {
			local ++code
			replace treatment = `code' if treat_str == "`framing' - `timing'"
			label define treatment `code' "`framing' - `timing'", add
		}
	}
	drop treat_str

	label values treatment treatment
	label variable treatment "Treatment arms"

count //9673

*Number of distinct assignment files (days) that were appended
	unique treatmentfilename //27 
		
*Each scrambled ID may appear only once
	gisid scrambledid

*August 23rd: everyone was on the morning/evening schedule (the morning SMS was
*delayed to 11 a.m. instead of 7:30 a.m.), so codes 1-5 are shifted to 6-10
	replace treatment = treatment + 5 if treatmentfilename == "sms_assignments_082320.dta" & treatment <= 5

*Which files contain rows without a treatment?
	tabulate treatmentfilename if missing(treatment) //30 across 3 dates

	*Export missing treatment assignments to an excel file
	*export excel treatmentfilename scrambledid using "$logfiles\missingtreatment_`c(current_date)'" if missing(treatment), first(var) replace
		
	*Rows without a treatment are treated as control (SMS delivery reports can
	*serve as extra verification)
	replace treatment = 0 if missing(treatment) //30

	assert !missing(treatment)
	
*Drop observations that are not in the master sample
	//drop if inlist(scrambledid, 8247, 10821, 11663, 16676, 18323)
	
drop rand 

tempfile treatmentassignment
save `treatmentassignment', replace

***Combine the treatment assignments (in memory) with the sample file
	
	merge 1:1 scrambledid using `sample'
	//9673 matched
	//3709 unmatched (from using)
	
	* Assignment-only rows, and agreement between treatment and arm for matched rows
	tabulate scrambledid treatment if _merge==1
	tabulate treatment arm if _merge==3

	* arm only holds the first-batch assignment, so it is not kept
	drop arm
	
	* Flag rows present in both files, then discard the merge indicator
	gen sampleinfo = (_merge==3)
	label variable sampleinfo "Observations matched between sample and treatment assignment"
	drop _merge

	***Store the merged data
	tempfile treatment
	save `treatment', replace

********************************************************************************
*4. Load raw survey data
******************************************************************************** 

* Stack every .dta file found in the raw-survey folder and record, for each row,
* the file it came from. The first file is loaded with -use-; every later file is
* appended with -force-. The macros below switch the command after the first pass.
local files : dir "$covidraw" files "*.dta"

local use use 
local clear clear
local gen "gen str80"
local if ""

	cd "$covidraw"
	foreach file in `files' {
	    
		`use' "`file'", `clear'
		`gen' surveyfilename = "`file'" `if'
		
		* From the second file on: append, and fill the file name only for the
		* newly added rows (those whose surveyfilename is still empty)
		local use "append using"
		local clear force
		local gen replace
		local if "if surveyfilename == """
	}

	* The yes/no value label is defined here, AFTER the data are loaded: -use-
	* replaces the dataset in memory together with its value labels, so a
	* definition made before the load would be discarded and the later
	* -la val ... yn- statements would attach an undefined label.
	la def yn 1 "Yes" 0 "No", modify

	*drop if surveyfilename == "8.30.2020 copy.dta" // Priyal says this is the correct one actually.
		// She removed the other version from Dropbox to avoid further confusion
	* A submission is identified by scramble_id + starttime wherever the ID is present
	gisid scramble_id starttime if !missing(scramble_id)
	la var surveyfilename "Survey raw data file name"
	
**Checks exported from treatment assignment file for certain observations
	*note; CHECK WITH DEV ABOUT THESE OBSERVATIONS (keeping the duplicate drop below, but otherwise no drops)
//	drop if key == "uuid:43a26fa9-cc26-45e1-a06b-4fea97cd5376" // Duplicate
	/*
	drop if key == "uuid:5a11e8ae-e521-4e12-aced-2abe08116497" // One distancing person who is in a handwashing round
	drop if inlist(key, "uuid:60e3d4b2-5d11-4a4f-bc2f-ee6d5774db0b", "uuid:ecd3efec-825e-4a53-87cc-796cf7930b40", "uuid:3dfe9630-d1fd-473b-a9be-206431856080") // Handwashing person in distancing round
	drop if inlist(scramble_id, 758, 2955, 5030, 17665, 17934) // 8.30 interviews that don't match treatment assignments
	drop if inlist(scramble_id, 6259) // 9.02 interview that didn't match treatment assignments 
	drop if key == "uuid:650aa5bb-1bd3-4cab-98ee-dbcbf696d513" // Handwashing in a distance round
	drop if scramble_id == 10067 // control interviewed a day later on 9/11
	drop if inlist(scramble_id, 2138, 12983)  
	drop if inlist(scramble_id, 4085, 6057, 7197, 13026)
	drop if inlist(scramble_id, 4675, 5419, 10418, 15371)
	*/
	
* Use the same ID name as the treatment files
rename scramble_id scrambledid

*Number of raw survey files that were appended
unique surveyfilename //49

*Drop irrelevant variables
*drop deviceid subscriberid simid devicephonenum caseid username calc_scram_ph_num calc_surv_num surv_num_oth from_number calc_resp_num calc_exo_ph_num 

*CALL STATUS / RESPONSE RATES
	
	label variable call_status "Call status"
	note call_status: "Please select call status"
	label define call_status ///
		1 "Picked Up" ///
		2 "Cut after few rings" ///
		3 "Number unreachable/switched off (Audio/Voice Recording)" ///
		4 "Invalid number" ///
		5 "Phone not in use" ///
		6 "Wrong number" ///
		7 "No incoming call facility on this number" ///
		8 "Ringing but no answer" ///
		9 "Asked to call later" ///
		88 "other [specify]"
	label values call_status call_status
	tabulate call_status, missing
		
		*Indicator: call was picked up (left missing when call_status is missing)
		gen byte respond = (call_status == 1) if !missing(call_status)
		tabulate respond, missing
		label variable respond "Call was picked up"
		label values respond yn
		order respond, after(call_status)
		
		tabulate call_status consent, missing
		
* Drop those who did not consent
//drop if consent == 2
		
*Check for duplicate entries if the call was picked up
//unique scrambledid if respond==1
//duplicates tag scrambledid if respond==1, gen(dupidrespond)
//tab dupidrespond, mi
//duplicates drop scrambledid if consent==2 & dupidrespond==1, force
//duplicates drop scrambledid if dupidrespond==1, force
//drop dupidrespond
*** Keep longest phone call when there are duplicates
*		bys scrambledid: egen longest = max(duration)
*		drop if longest != duration & !missing(scrambledid) & respond == 1 
*		gisid scrambledid if respond == 1

	label variable call_status_others "Call status (Others)"
	note call_status_others: "Please specify others"
	tabulate call_status_others, missing

		* Free-text harmonisation: any entry containing one of the listed
		* spellings is replaced by a single category name
		tempvar hit

		gen byte `hit' = 0
		foreach pattern in "awaj" "Awaj" "awaz" "Awaz" {
			replace `hit' = 1 if regexm(call_status_others, "`pattern'")
		}
		replace call_status_others="Audio issues" if `hit' == 1

		replace `hit' = 0
		foreach pattern in "Number busy" "number busy" "Number Busy" "No busy" "no busy" "Busy bata" {
			replace `hit' = 1 if regexm(call_status_others, "`pattern'")
		}
		replace call_status_others="Number busy" if `hit' == 1
		drop `hit'

	note call_status_others: "FLAG: still needs to be clean"
													 
	label variable call_later "Appointment time for calling later"
	note call_later: "Note down the appointment time"

	***CONSENT
	* Raw coding is 1 = yes / 2 = no; "no" is recoded to 0 so that the variable
	* can carry the yn value label
	label variable consent "Consent given"
	note consent: "Do you give us consent to proceed with the survey?"
	label define consent 1 "Yes" 0 "No"
	label values consent consent
	recode consent (2=0)
	label values consent yn
	tabulate consent, missing
	tabulate consent if respond==1
	//majority consented

	label variable consent_c "Comments:"
	note consent_c: "Comments:"	
	
***DATE AND TIME FOR SURVEY
	* Actual interview date: the first 11 characters of time_01 parsed as
	* year-month-day
	cap drop actualsurvey0 actualsurveydate
	gen actualsurvey0 = substr(time_01, 1, 11)
	gen double actualsurveydate = date(actualsurvey0, "YMD")
	drop actualsurvey0
	format actualsurveydate %td
	label variable actualsurveydate "Actual date of interview"
	
	* Survey date and time
	* (time_01 is the source because its string format is consistent enough to
	* convert, unlike some other time variables, and because it is filled for
	* every observation whether or not the respondent answered the phone)
	gen double actualdatetime = clock(time_01, "YMD hms")
	format actualdatetime %tc
	label variable actualdatetime "Date and Time in Stata Format"

	
	
	
***DURATION FOR EACH MODULE
	* Start time of each module: the 8-character time-of-day part of time_0`i'.
	* The part starts at character 13, or at character 12 when the interview day
	* of the month is below 10.
	* The clock() result is stored as DOUBLE: Stata datetimes are millisecond
	* counts, and the default float type cannot hold them exactly, which would
	* round the module start times.
	forval i = 1/7 {
		cap drop module_`i'0 module_`i'
		gen module_`i'0 = substr(time_0`i',13,8)
		replace module_`i'0 = substr(time_0`i',12,8) if day(actualsurveydate) < 10
		gen double module_`i' = clock(module_`i'0, "hms")
		format module_`i' %tc
		drop module_`i'0
		la var module_`i' "Start time for Module `i'"
	}
	
	* Duration of module i = start of module i+1 minus start of module i, in minutes
	forval i = 1/6 {
		local next = `i' + 1
		gen duration_`i' = minutes(module_`next'-module_`i')
	}
	
	forval i = 1/6 {
		la var duration_`i' "Time taken to complete Module `i' (in minutes)"
	}

***Create a log of number of attempts per scrambledid
	
	/*-- Scrambled ID successfully interviewed more than once --
	Create checks for this issue so that we don't double-count respondents.
	(currently not an issue but we should be robust against this, especially if we recycle treatment assignments
	that we believe were never actually called)*/
	
	*Per scrambled ID, in call-time order: total number of calls, the running
	*number of each call, and the longest recorded duration
	bysort scrambledid (actualdatetime): gen attempts = _N
	by scrambledid: gen attemptnum = _n
	by scrambledid: egen longestcall = max(duration)
	
	summarize attempts
	label variable attempts "Total number of interview attempts made"
	//On average, a number was called 1.7 times, min: 1 and max: 32
	
	*Number of consents recorded within each ID (tabulated once per ID)
	by scrambledid: egen total_consent = total(consent)
	tabulate total_consent if attemptnum==1
	
	*Make sure we don't have any scrambled IDs that have consented twice
	*(If so, look into whether it was a survey that was interrupted and completed at a later time.
	*That would require manual cleaning but should be OK once cleaned)
	*assert total_consent==0 | total_consent==1

**** Something weird with a few responses, needs investigation; see the first few rows of this tabulation:
tab submissiondate 
drop if missing(scrambledid) // won't be needed once raw data is cleaned
	// [Call set up with Priyal to investigate root cause]
	// Look for example at 10/17 submission date data. Maybe problematic punctuation in a response
	// NOTE: This error doesn't persist through the end of cleaning, but I don't want any missing entries so need to resolve with Priyal
	// (The error persists as far back as the RAW data, so probably need to fix in the SurveyCTO export)
	
***Reduce the data to one row per scrambledid
***(the steps below are order-dependent: each drop relies on counts computed before it)
	
	* Keep one entry per scrambled ID:
		/*
		Pseudocode:
		Summarize call response history
		
		Generate variable for if ID has any success (i.e. respondent does survey)
		Then drop observations within ID that have no consent
		
		Then drop all but one observation from those that have no consent
		
		And then drop any variables that are just temporary variables for this process
		*/
		
	* Cleaning up remaining sampling errors (individual submissions removed by their key)
	* NOTE(review): attempts/attemptnum were computed before these drops and are not
	* recomputed afterwards - confirm that no affected ID relies on them further down
	drop if key=="uuid:4bbaf989-6b9d-4aa7-9281-1494bec92c6a" // Scrambledid 7851 - this interview is marked as "picked up" but no interview data
	drop if key=="uuid:e5eb441f-5d8c-4dbe-a1f9-6181f30cb78c" // Scrambledid 10454 - (see below)
	// (currently keep uuid:43a26fa9-cc26-45e1-a06b-4fea97cd5376, but almost identical interview with same person on same day - first day survey hiccups probably)

	* These are calls that were conducted on the wrong dates, so we suspect the wrong Scrambledid was entered,
	* and thus the respondent didn't receive the expected SMS that we associate with the scrambledid before the interview
	drop if key=="uuid:abe28344-5a90-40e4-a12e-0f1feb071d39" // Scrambledid 5419
	drop if key=="uuid:7a53991d-97e3-4d6d-b9be-72e38882a148" // Scrambledid 6259
	drop if key=="uuid:4479d87e-2669-4fde-8d2b-9172d029f78a" // Scrambledid 15783 (two days after correct date, although the one we keep is still one day late)
	drop if key=="uuid:dea10d47-0848-4057-acdd-c81d39afb699" // Scrambledid 17125
	
	* Number of picked-up calls (call_status == 1) per scrambledid
	bysort scrambledid: egen id_call_success = sum(call_status==1)
	tab id_call_success
	
	* Exactly one successful call: keep it, drop the unsuccessful ones, and verify uniqueness
	drop if call_status != 1 & id_call_success == 1
	gisid scrambledid if id_call_success == 1
	
	* No successful call: keep only the latest attempt (attemptnum equal to attempts)
	drop if attemptnum != attempts & id_call_success == 0
	gisid scrambledid if id_call_success == 0
	
	* Two successful calls: drop the unsuccessful ones, then keep the call with the longest duration
	* NOTE(review): IDs with more than two successful calls are not covered by the first drop below;
	* they are only reduced by the longest-duration rule (dup > 0) - confirm this is intended
	drop if call_status != 1 & id_call_success == 2
	duplicates tag scrambledid, gen(dup)
	count if duration == longestcall & attemptnum == attempts & id_call_success > 1
	count if duration != longestcall & id_call_success == 2
	* longest duration among the rows that are still present
	bys scrambledid: egen longestcall2 = max(duration)
	drop if duration != longestcall2 & dup > 0
	drop longestcall2 
	gisid scrambledid if id_call_success == 2
	* Only keep one observation (the most recent) from the remaining scrambledid
	//drop if id_call_success != 1 & (attempts != attemptnum)
	//drop attemptnum
		
	* Redundant drop of duplicate scrambledids in case some remain
	//duplicates drop scrambledid, force // Keeps 4669 on 9.22.20 (note for testing updated code)
	//drop if missing(scrambledid)
	* Final check: the data must now be unique on scrambledid
	gisid scrambledid


***COMPLETED INTERVIEWS

	* Raw coding is 1 = yes, 2 = no, 555 = phone cut. After the raw tabulation,
	* 555 becomes missing and 2 becomes 0 so that the yn value label applies.
	label variable complete "Interview completed"
	note complete: "were you able to complete the interview fully?"
	label define complete 1 "Yes" 2 "No" 555 "Phone cut"
	label values complete complete
	tabulate complete, missing
	recode complete (555=.) (2=0)
	label values complete yn
		
	label variable notcomplete "Reason for not completing interview"
	note notcomplete: "what is the reason for incomplete survey?"
	label define notcomplete ///
		1 "The respondent cut the phone in the middle of the survey and didn’t pick the pho" ///
		2 "Phone number was blocked and became invalid in the second attempt" ///
		3 "The respondent denied to participate further" ///
		4 "Interview had to be cut short and will call again for the rest of the interview" ///
		555 "Phone cut"
	label values notcomplete notcomplete
	tabulate notcomplete, missing

	label variable notcomplete_dttme "Date and Time"
	note notcomplete_dttme: "Date and Time"

	label variable surveyor_c "Please write here if you have any comments on this interview"
	note surveyor_c: "Please write here if you have any comments on this interview"

* Snapshot of the survey data before it is merged with the assignments
tempfile surveyrawdata
save `surveyrawdata', replace
		
********************************************************************************
*5. Merge raw survey data with treatment assignments
******************************************************************************** 

use `surveyrawdata', clear

* Survey data (master) joined to sample + treatment assignment (using)
merge 1:1 scrambledid using `treatment'
//matched = 12,798
//unmatched = 609 (1 from master and 608 from using)

	***Diagnostics for rows that did not match
	
		* Survey rows without a treatment-assignment record
		tabulate scrambledid actualsurveydate if _merge==1
		tabulate scrambledid dateofdelivery if _merge==1
		foreach check in call_status respond complete {
			tabulate `check' if _merge==1
		}
		
		*Surveyed observations that have no treatment assignment are treated as control
		replace treatment = 0 if _merge == 1
		
		*Assigned observations without survey data (they can potentially still be called)
		tabulate treatmentfilename if _merge==2
		tabulate treatment if _merge==2

		gen notcalledyet = 1 if _merge==2 
		tabulate notcalledyet, missing

		/*
			*Export observations that have not been called yet but are in the treatment assignment files.
			if "`c(username)'" == "girijabahety" {
						export excel treatmentfilename scrambledid treatment behavior behavior_interview using "$logfiles\notcalledyet_`c(current_date)'" ///
						if notcalledyet==1 & ~missing(treatmentfilename), replace first(var)
			}
			if "`c(username)'" == "jimmy" {
						export excel treatmentfilename scrambledid treatment behavior behavior_interview using "$logfiles/notcalledyet_`c(current_date)'" ///
						if notcalledyet==1 & ~missing(treatmentfilename), replace first(var)
			}
		*/
	
		*assert _merge==3
		* Rows that exist only in the treatment files (never surveyed) are removed;
		* _merge itself is kept for now
		drop if _merge == 2
		*drop _merge
		
	***Check for interview dates (as per the sample assigned)
	
		tab dateofinterview, mi
		* Jimmy Note 9.23.2020: Fixed to work dynamically for day that the code is run
		* Today's date as a Stata daily date. c(current_date) is a "dd Mon yyyy"
		* string, so it is parsed with date(); the earlier td() call carried a
		* stray closing quote inside the parentheses.
		local datetoday = date("`c(current_date)'", "DMY")
		di "`datetoday'"
		* Flag interviews whose planned date lies after today. Missing planned dates
		* are excluded explicitly: a missing value compares as larger than any
		* number and would otherwise be flagged as well.
		gen interviewedearlier = 1 if dateofinterview > `datetoday' & !missing(dateofinterview)
		tab interviewedearlier, mi
		
		/*
		JIMMY NOTE: Commenting out because study is over and logging no longer works, but preserving for later review if needed.
		
			*Export observations that were called earlier than their assigned interview dates to an excel file for removal from the future survey sample
			if "`c(username)'" == "girijabahety" {
				export excel scrambledid dateofinterview using "$logfiles\calledearlier_`c(current_date)'" if interviewedearlier==1, replace first(var)
			}
			if "`c(username)'" == "jimmy" {
				export excel scrambledid dateofinterview using "$logfiles\calledearlier_`c(current_date)'" if interviewedearlier==1, replace first(var)
			}
			
			*Convert these interviews done earlier as controls
				replace treatment = 0 if interviewedearlier == 1
		*/	
	
	
	***Cross-check behaviour assignment against treatment for all observations
	
		tabulate treatment behavior, missing
		* Rows without a treatment although their behavior is not control (9)
		tabulate dateofdelivery if missing(treatment) & behavior != 9
		tabulate dateofinterview if missing(treatment) & behavior != 9
		tabulate behavior behavior_interview if missing(treatment)

	*** JIMMY NOTE 9.24.2020: any treatment still missing after cleaning is set to control, which should be the correct assignment
	replace treatment = 0 if missing(treatment)
	
		***Observations that should have been surveyed but have no start time
		tabulate treatment if starttime=="" // None currently
	
	*** Controls carry behavior code 9; this also covers observations that were meant to get an SMS but did not because of technical issues
	replace behavior = 9 if treatment==0
	
	
********************************************************************************
*6. Clean - rename, label, check for outliers

	*labeling codes pulled from the JPAL .do file
******************************************************************************** 
	
	***TRACK DROP-TIMING
	* One indicator per survey section, defined for consenting respondents only:
	* 1 when the section's marker question holds the phone-cut code 555, else 0.
	* From section 4 onward the indicator is additionally left missing when the
	* marker question itself is missing (compared against . exactly).
	
	*Section 1 - characteristics
	gen phonecut_sec1 = (occupation == 555) if consent == 1
	label variable phonecut_sec1 "Phone Cut During Section 1: Personal Characteristics"
	
	*Section 2 - COVID knowledge and behavior (string-coded markers)
	gen phonecut_sec2 = (protection_use == "555" | protection2 == "555") if consent == 1
	label variable phonecut_sec2 "Phone Cut During Section 2: COVID Knowledge and Behavior"
	
	*Section 3 - COVID-related specific questions
	gen phonecut_sec3 = (covering_comm == 555) if consent == 1
	label variable phonecut_sec3 "Phone Cut During Section 3: COVID-related specific questions"
	
	*Section 4 - SMS-related questions
	gen phonecut_sec4 = (smsfrequency == 555) if consent == 1 & smsfrequency != .
	label variable phonecut_sec4 "Phone Cut During Section 4: SMS-related questions"
	
	*Section 5 - Risk Perceptions
	gen phonecut_sec5 = (die_covid == 555) if consent == 1 & die_covid != .
	label variable phonecut_sec5 "Phone Cut During Section 5: Risk Perceptions"
	
	*Section 6 - MC Scale
	*The phone cut is only recorded on the 3 randomly assigned MC questions, so all
	*13 items are scanned: first 0 if any item was answered, then 1 if any item is 555
	gen phonecut_sec6 = .
	forvalues item = 1/13 {
		replace phonecut_sec6 = 0 if consent == 1 & mc`item' != .
	}
	forvalues item = 1/13 {
		replace phonecut_sec6 = 1 if consent == 1 & mc`item' == 555
	}
	label variable phonecut_sec6 "Phone Cut During Section 6: MC Scale"
	
	*Section 7 - Health Seeking
	gen phonecut_sec7 = (child == 555) if consent == 1 & child != .
	label variable phonecut_sec7 "Phone Cut During Section 7: Health Seeking"
	
	*Section 8 - HH Characteristics
	*NOTE(review): complete had 555 recoded to missing earlier in this file
	*(recode complete (555=.) (2=0)), so the == 555 test can no longer match and
	*this flag can only be 0 or missing - confirm intent
	gen phonecut_sec8 = (complete == 555) if consent == 1 & complete != .
	label variable phonecut_sec8 "Phone Cut During Section 8: HH Characteristics"
	
	***INTERVIEW-SPECIFIC VARIABLES
	
		label variable key "Unique submission ID"
		* These may be absent from the data, hence -capture-
		cap label variable submissiondate "Date/time submitted"
		cap label variable formdef_version "Form version used on device"
		cap label variable review_status "Review status"
		cap label variable review_comments "Comments made during review"
		cap label variable review_corrections "Corrections made during review"
		label variable starttime "Start time of the interview"
		label variable endtime "End time of the interview"
		label variable surveyor_name "Select your name"
		
		label define surv_num 1 "Primary Number" 2 "Alternate Number"
		label values surv_num surv_num
		label variable surv_num "Surveyor Mobile Number"

		*la var surv_num_oth "Alternate Number"

		label define exotel_ph_num 1 "8047189759" 2 "7314855697"
		label values exotel_ph_num exotel_ph_num
		label variable exotel_ph_num "Phone Number"
		
		* response is inspected once and then removed
		tabulate response
		drop response
		
		label define language 1 "Hindi" 2 "Bhojpuri" 3 "English" 555 "Phone cut"
		label values language language
		label variable language "Language for the interview"
		note language: "Which language would you prefer for this call?"
		tabulate language, missing
		//majority Hindi
		
	***IDENTIFIERS
	
		label variable scrambledid "scramble_id"
		*note scramble_id: "scramble_id"
		unique scrambledid
		*note scrambledid: "Observations are not uniquely identified"
		
* The merge indicator from the survey/treatment merge is no longer needed
drop _merge
		
tempfile cleaneddata
save `cleaneddata', replace		

********************************************************************************
*7. Merge with delivery reports for SMS to check compliance
******************************************************************************** 	

*Delivery reports, one row per SMS send attempt
use "$deliveryreports/covidstudy_deliveryreport_merged_21oct2020_deid.dta", clear

*Number of SMS send attempts per ID
bysort scrambledid: gen sms_send_attempts = _N
label variable sms_send_attempts "Delivery Reports: Number of SMS Send Attempts"
summarize sms_send_attempts
//mean = 10

*One indicator per delivery status; stat1-stat4 are assumed to correspond to the
*four status names listed below, in this order
tabulate status, gen(stat)
local i = 0
foreach statusname in "Delivered" "Failed" "Not delivered" "Sent" {
	local ++i
	label variable stat`i' "SMS - `statusname'"
	replace stat`i'=. if missing(status)
	label values stat`i' yn
	tabulate stat`i', missing
}
order stat1-stat4, after(status)

*Per-ID count of each delivery status
local i = 0
foreach kind in delivered failed notdelivered sent {
	local ++i
	bysort scrambledid: egen `kind'_count = total(stat`i')
}
// sent_count: a few IDs from back in August had multiple rounds of SMS's apparently. Investigate

label variable delivered_count "Number of SMS delivered"
label variable failed_count "Number of SMS failed"
label variable notdelivered_count "Number of SMS not Delivered"
label variable sent_count "Number of SMS sent"

*Share of each delivery status among all send attempts of the ID
foreach kind in delivered failed notdelivered sent {
	gen `kind'_frac = `kind'_count / sms_send_attempts
}

label variable delivered_frac "Delivery Reports: % SMS with status of Delivered"
label variable failed_frac "Delivery Reports: % SMS with status of Failed"
label variable notdelivered_frac "Delivery Reports: % SMS with status of Not Delivered"
label variable sent_frac "Delivery Reports: % SMS with status of Sent"

/*

* Identify and log phone numbers that have been texted more than 4 times
		if "`c(username)'" == "girijabahety" {
			export excel scrambledid smsdatetime using "$logfiles\extraSMS_`c(current_date)'" if sms_send > 4 & scrambledid !=., replace first(var)
		}
		if "`c(username)'" == "jimmy" {
			export excel scrambledid smsdatetime using "$logfiles/extraSMS_`c(current_date)'" if sms_send > 4 & scrambledid !=., replace first(var)
		}
*/
		
		
*Only the per-ID summary variables are carried forward
keeporder scrambledid sms_send_attempts delivered_* failed_* notdelivered_* sent_*

*Collapse to one row per scrambledid (rows without an ID are discarded)
drop if missing(scrambledid)
duplicates drop
//bysort scrambledid: keep if _n == 1

*Delivery summaries (master) joined to the cleaned survey data (using)
merge 1:1 scrambledid using `cleaneddata'
//9,009 matched
//4291 unmatched (501 from master and 3790 from using)

tabulate treatment if _merge==2 //3419 control which makes sense but many treatments don't have delivery reports
tabulate treatmentfilename if _merge==2 //from earlier days of intervention

*Survey rows without a delivery report must be controls or initial assignments
assert (treatment == 0 | treatmentfilename == "initial_assignment_0816") if _merge == 2

*Delivery reports without survey data are not kept
keep if _merge != 1
drop _merge

********************************************************************************
*9. DERIVED VARIABLES (POSSIBLY TRANSFER TO ANALYSIS CODE)
******************************************************************************** 
	
	* Looking at the treatment file, the last date of SMS delivery was 17oct and
	* not 16oct; all other dates match with filenames.
	* td() is written with an unquoted date literal (its documented form) throughout.
	replace dateofdelivery = td(17oct2020) if dateofdelivery == td(16oct2020)
	
	* Also resolving the last couple days, where plans changed relative to master list.
	* Statement order matters: 20oct interviews are first moved to 19oct, then the
	* 17oct delivery group is set to 20oct.
	replace dateofinterview = td(19oct2020) if dateofinterview == td(20oct2020)
	replace dateofinterview = td(20oct2020) if dateofdelivery == td(17oct2020)
	
	* Days between SMS delivery and the actual interview
	gen interviewlag =  (actualsurveydate - dateofdelivery) 
	tab interviewlag, mi
	tab actualsurveydate dateofdelivery if interviewlag<0
	
	* When was respondent interviewed, compared with planned day?
	gen interviewdelta = actualsurveydate - dateofinterview
	
	
	* Assign respondents who were called before receiving an SMS to control
	* (a missing interviewlag is never < 0, so those rows are left untouched)
	replace treatment=0 if interviewlag<0
	replace behavior=0 if interviewlag<0
	
	* Interview lag is blanked for every control, including respondents who were
	* re-assigned to control ex-post (e.g. because they were scheduled to receive
	* an SMS but it was never sent)
	replace interviewlag = . if treatment==0
	
	* Stata treats missing as larger than any number, so "interviewlag>7" alone
	* would also select every row whose lag was just set to missing.
	tab actualsurveydate dateofdelivery if interviewlag>7 & !missing(interviewlag)
	
	* Variable for assigned recall period
	gen assignedrecall = dateofinterview - dateofdelivery
	
	* Five day recall (1 if SMS-to-interview time was 5-7 days, 0 otherwise, missing if dates not available).
	* The indicator is built from actualsurveydate, so a missing actualsurveydate
	* must also yield missing rather than 0; the original dateofinterview and
	* dateofdelivery checks are retained.
	gen fivedayrecall = inlist(actualsurveydate - dateofdelivery, 5, 6, 7)
	replace fivedayrecall = . if missing(dateofinterview, dateofdelivery, actualsurveydate)
	la var fivedayrecall "Five Day Recall"
	tab fivedayrecall, mi
	
	* Three day recall (1 if SMS-to-interview time was 3-4 days), set to missing
	* on the same rows as fivedayrecall so the two indicators agree.
	gen threedayrecall = inlist(actualsurveydate - dateofdelivery, 3, 4)
	replace threedayrecall = . if missing(dateofinterview, dateofdelivery, actualsurveydate)
	la var threedayrecall "Three Day recall"

********************************************************************************
*8. DEFINE TREATMENTS, STRATA, OUTCOMES
******************************************************************************** 

	***TREATMENTS
		
		* Stop here if any observation lacks a treatment assignment
		assert !missing(treatment)
		
		***10 treatment arms + 1 control
		* Keep the raw arm code under a clearer name and build one dummy per arm
		rename treatment treatment_arm
		tabulate treatment_arm, missing
		tabulate treatment_arm, generate(treatment_arm_)
		
		***5 frames + 1 control
		* Arms 6-10 map onto the same frame codes as arms 1-5
		gen treatment_frame = treatment_arm 
		replace treatment_frame = treatment_frame - 5 if inlist(treatment_frame, 6, 7, 8, 9, 10)
		label define treatment_frame 0 "Control" 1 "Neutral" 2 "Public Gain" 3 "Public Loss" 4 "Private Gain" 5 "Private Loss" 
		label values treatment_frame treatment_frame
		label variable treatment_frame "Treatment message frames"
		tabulate treatment_frame, generate(treatment_frame_)
		
		***Gain/Loss frames + 1 control
		* Frame 1 -> 1, frames 2 and 4 -> 2, frames 3 and 5 -> 3, anything else -> 0
		gen treatment_gain = cond(treatment_frame == 1, 1, cond(inlist(treatment_frame, 2, 4), 2, cond(inlist(treatment_frame, 3, 5), 3, 0)))
		label define treatment_gain 0 "Control" 1 "Neutral" 2 "Gain" 3 "Loss"
		label values treatment_gain treatment_gain
		label variable treatment_gain "Treatment message neutral or gain or loss framing"
		tabulate treatment_gain, generate(treatment_gain_)
		
		***2 frequencies + 1 control
		* Arm codes in 1-5 -> 1, in 6-10 -> 2, every other value carried over as is
		gen treatment_timing = cond(inrange(treatment_arm, 1, 5), 1, cond(inrange(treatment_arm, 6, 10), 2, treatment_arm))
		label define treatment_timing 0 "Control" 1 "Twice Morning" 2 "Morning/Evening"
		label values treatment_timing treatment_timing
		label variable treatment_timing "Treatment message timing"
		tabulate treatment_timing, generate(treatment_timing_)
		
		***1 pooled treatment + 1 control
		* 1 for any arm code above 0, 0 otherwise; left missing where the arm is missing
		gen byte treatment_pooled = (treatment_arm > 0) if !missing(treatment_arm)
		label define treatment_pooled 0 "Control" 1 "Treatment"
		label values treatment_pooled treatment_pooled
		label variable treatment_pooled "Treatment"
		
	***BEHAVIORS
		
		***2 treatment behaviors + 1 control
		* Code 9 is folded into 0 before the variable is renamed
		tabulate behavior, missing
		replace behavior = 0 if behavior == 9
		label define behavior 0 "Control" 1 "Distancing" 2 "Handwashing", replace
		rename behavior behavior_treatment
		label variable behavior_treatment "Treatment target behavior and Control"
		
		* Pooled controls must not keep a treatment behavior code
		tabulate behavior_treatment treatment_pooled, missing
		replace behavior_treatment = 0 if behavior_treatment>0 & treatment_pooled==0
		
		***Controls randomly assigned behaviors for interview - 2 controls by behavior
		* From here on "behavior" refers to the interview behavior
		rename behavior_interview behavior
		label variable behavior "Behavior"
		tabulate behavior, missing nolabel
		
		* Cross-checks of interview behavior against treatment behavior and pooled treatment
		tabulate behavior behavior_treatment, missing
		tabulate behavior treatment_pooled, missing
		
		* Indicator per interview behavior (1 when behavior equals the code, 0 otherwise)
		gen behavior_sd = (behavior == 1)
		label variable behavior_sd "Behavior - Social distancing"
		gen behavior_hw = (behavior == 2)
		label variable behavior_hw "Behavior - Handwashing"
		
	***Rounds
		* The treatment round is identified by the SMS delivery date
		tab dateofdelivery, mi
		gen round = dateofdelivery
		la var round "Treatment Round"
		
	***Survey periods
		* Daily date parsed from the submissiondate string
		gen surveydate = date(submissiondate, "MDYhm#")
		format surveydate %td
	
	***Create day of the week dummies
	* Drop leftovers one variable at a time: a single "drop a b" aborts without
	* dropping anything when either variable is absent, which would leave
	* dayofinterview in place and make the gen below fail.
	cap drop dayofinterview
	cap drop dayint
	gen dayofinterview = dow(actualsurveydate)
	* "replace" keeps the definition from failing when the value label already
	* exists (value labels are not removed when a variable is dropped)
	la def dayofinterview 0 "Sunday" 1 "Monday" 2 "Tuesday" 3 "Wednesday" 4 "Thursday" 5 "Friday" 6 "Saturday", replace
	la val dayofinterview dayofinterview
	count if missing(dayofinterview)
	la var dayofinterview "Day of interview"
	
	***Create block id
	* Numeric, value-labelled version of the string block variable
	encode block, gen(blockid)
	
	***Create roundofinterview variables from the survey distribution (pooling by the recall period)
	* Round r covers the r-th start date through the r-th end date (both inclusive);
	* interview dates outside all 14 windows stay missing.
	local round_start 16aug2020 22aug2020 28aug2020 02sep2020 06sep2020 10sep2020 13sep2020 18sep2020 24sep2020 27sep2020 03oct2020 09oct2020 13oct2020 17oct2020
	local round_end   21aug2020 27aug2020 01sep2020 05sep2020 09sep2020 12sep2020 17sep2020 23sep2020 26sep2020 02oct2020 08oct2020 12oct2020 16oct2020 20oct2020
	gen roundofinterview = .
	forvalues r = 1/14 {
		local first : word `r' of `round_start'
		local last : word `r' of `round_end'
		replace roundofinterview = `r' if inrange(dateofinterview, td(`first'), td(`last'))
	}
	la var roundofinterview "Interview Round"
	
	***Create enumerator id
	* Numeric, value-labelled enumerator ID; the original name string is then dropped
	encode surveyor_name, gen(enumerator)
	la var enumerator "Enumerator ID"
	drop surveyor_name

	//// OUTSTANDING DELIVERY-REPORT CHECKS (STILL TO BE COMPLETED)
	
	* Check: do any controls show SMS send attempts?
	tabulate treatment_arm sms_send_attempts
	
	* Check: non-control observations with no send attempts recorded, by survey file
	* (looks like we just don't have deliveries for last week of data loaded?)
	tabulate surveyfilename if treatment_arm!=0 & sms_send_attempts==.
	
* Dataset label
label data "First stage of cleaning processed"
	
* Confirm scrambledid uniquely identifies observations, then write the cleaned file
gisid scrambledid
save "$covidclean/SMS_merged_data.dta", replace
